#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
# @Description Data cleaning for the 300K (30W) news text corpus
# @Time : 2020/1/4 19:35 
# @Author : sky 
# @Site :  
# @File : 30wClear.py 
# @Software: PyCharm
"""

import os
import re
import time
from ClearText.REdealText import text_parse


# Efficient file reading
# Iterators
class LoadFolders(object):
    """Iterable over the immediate sub-directories of ``parent_path``.

    Every iteration lists ``parent_path`` again and yields, for each entry
    that is a directory, the entry name joined onto ``parent_path``.
    Entries that are not directories are skipped.
    """

    def __init__(self, parent_path):
        # Directory whose child folders are enumerated on iteration.
        self.parent_path = parent_path

    def __iter__(self):
        root = self.parent_path
        # Build the joined paths lazily, then keep only the directories.
        children = (os.path.join(root, name) for name in os.listdir(root))
        for child in filter(os.path.isdir, children):
            yield child


class LoadFiles(object):
    """Iterable over the documents stored one level below ``parent_path``.

    The directory is walked as ``parent_path/<category>/<file>``. Iterating
    yields one ``(category, content)`` tuple per regular file, where
    ``category`` is the name of the containing folder and ``content`` is the
    file's bytes decoded as UTF-8.

    Raises:
        OSError: if a directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """

    def __init__(self, parent_path):
        # Root directory that holds one sub-folder per category.
        self.parent_path = parent_path

    def __iter__(self):
        # First level: the category folders.
        for folder in LoadFolders(self.parent_path):
            category = os.path.basename(folder)
            # Second level: the entries inside each category folder.
            for file in os.listdir(folder):
                file_path = os.path.join(folder, file)
                # Skip anything that is not a regular file (e.g. a nested
                # directory); there is no content to yield for it.
                if not os.path.isfile(file_path):
                    continue
                # Read as bytes and decode explicitly (original author's
                # note: 'rb' reads are fast). The ``with`` block closes the
                # handle before yielding, so it is released even if the
                # consumer stops iterating early or decoding fails.
                with open(file_path, 'rb') as this_file:
                    content = this_file.read().decode('utf-8')
                yield category, content


if __name__ == '__main__':
    # Wall-clock start, used for the elapsed-time report at the end.
    began = time.time()

    corpus_root = '../dataset/CSCMNews'
    # Sampling: only every `step`-th document is cleaned.
    step = 2
    for position, (label, raw_text) in enumerate(LoadFiles(corpus_root)):
        if position % step:
            continue
        cleaned = text_parse(raw_text)
        # Emit a progress line once per 10000 sampled documents.
        if (position // step) % 10000:
            continue
        stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        print('{t} *** {i} \t docs has bean dealed'.format(t=stamp, i=position),
              '\n', label, '\t', cleaned[:20])

    print('Total Cost Time %.2f' % (time.time() - began) + 's')
